In [1]:
import altair as alt
import eland as ed
import json
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
alt.data_transformers.disable_max_rows()
Out[1]:
DataTransformerRegistry.enable('default')
In [2]:
df = pd.read_csv('../data/tirol_obituaries_deduped_weekly.csv', delimiter = ',')
In [3]:
df.head()
Out[3]:
district municipaly year week count
0 Kufstein Kirchbichl 2017 1 1
1 Kufstein Kirchbichl 2017 2 2
2 Kufstein Kirchbichl 2017 3 0
3 Kufstein Kirchbichl 2017 4 2
4 Kufstein Kirchbichl 2017 5 2
In [4]:
#http://www.statistik.at/wcm/idc/idcplg?IdcService=GET_NATIVE_FILE&RevisionSelectionMethod=LatestReleased&dDocName=122943
statistik_austria = pd.DataFrame({
    'week': [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1],
    'death_count': [160,153,152,136,130,128,137,123,117,119,132,133,138,136,123]
})
statistik_austria.head()
Out[4]:
week death_count
0 15 160
1 14 153
2 13 152
3 12 136
4 11 130

Outlier Detection

Running ./bin/parse.sh will create the source file for outlier detection: ./data/tirol_obituaries_deduped_weekly_outlier_detection_features.csv.

The columns of this file are: district,municipaly,year,week,count,yearly_max,weekly_max

yearly_max and weekly_max are the derived feature vectors. They are the difference between each rows count and the municipaly's maximum value of the row's week and the yearly maximum of that municipaly.

To calculate the outlier_score for each row, use the following configuration with Elasticsearch's Outlier Detection feature:

{
  "id": "tirol_outlier_high_count_2_1",
  "description": "",
  "source": {
    "index": [
      "tirol_outlier_high_count_2"
    ],
    "query": {
      "match_all": {}
    }
  },
  "dest": {
    "index": "tirol_outlier_high_count_dest_2_1",
    "results_field": "ml"
  },
  "analysis": {
    "outlier_detection": {
      "compute_feature_influence": true,
      "outlier_fraction": 0.05,
      "standardization_enabled": true
    }
  },
  "analyzed_fields": {
    "includes": [],
    "excludes": [
      "count",
      "week",
      "year"
    ]
  },
  "model_memory_limit": "2mb",
  "create_time": 1587934690989,
  "version": "8.0.0",
  "allow_lazy_start": false
}

To learn more about the feature head over to the docs here.

In [5]:
with open('elasticsearch-host', 'r') as file:
    es_host = file.read().replace('\n', '')

ed_outliers = ed.read_es(es_host, 'tirol_outlier_high_count_dest_2_1')
ed_outliers = ed_outliers[['district', 'municipaly', 'year', 'week', 'ml.outlier_score']]
df_outliers = ed.eland_to_pandas(ed_outliers)
df_outliers = df_outliers.rename(columns={'ml.outlier_score': 'outlier_score'})
df_outliers.to_csv('../data/tirol_obituaries_outliers_weekly.csv', index=False) 
df_outliers.head()
Out[5]:
district municipaly year week outlier_score
-4BHuHEBq9uwbHkG_5-_ Imst Sölden 2020 7 0.061268
-4BHuHEBq9uwbHkG_52_ Innsbruck-Land Oberperfuss 2020 5 0.061268
-4BHuHEBq9uwbHkG_56_ Innsbruck-Land Gnadenwald 2020 6 0.061268
-4BHuHEBq9uwbHkG_5a_ Kufstein Kufstein 2020 15 0.061268
-4BHuHEBq9uwbHkG_5e_ Kufstein Wildschönau 2020 16 0.061268
In [6]:
d_both = pd.merge(df, df_outliers, how='left', on=['district', 'municipaly', 'year', 'week'])
d_both.head()
Out[6]:
district municipaly year week count outlier_score
0 Kufstein Kirchbichl 2017 1 1 NaN
1 Kufstein Kirchbichl 2017 2 2 NaN
2 Kufstein Kirchbichl 2017 3 0 NaN
3 Kufstein Kirchbichl 2017 4 2 NaN
4 Kufstein Kirchbichl 2017 5 2 NaN
In [7]:
base = alt.Chart(df, width=470, height=200)

x_domain=(1,52)

week_cut_off='17'

week_point = base.transform_filter(
    'datum.year==2020 & datum.week<'+week_cut_off
).mark_circle(color='red', opacity=0, size=150).encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('sum(count)', title=''),
    tooltip=[
        alt.Tooltip('week:O', title="Kalenderwoche"),
        alt.Tooltip('sum(count):Q', title="Todesanzeigen")
    ]
)

week_historic_area = base.transform_filter(
    'datum.year!=2020 && datum.year!=2016'
).transform_joinaggregate(
    week_count='sum(count)',
    groupby=['year', 'week']
).mark_area(color='lightgray', opacity=0.5, strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('max(week_count):Q', title=''),
    alt.Y2('min(week_count):Q', title='')
)

week_historic_mean = base.transform_filter(
    'datum.year!=2020 && datum.year!=2016'
).transform_joinaggregate(
    week_count='sum(count)',
    groupby=['year', 'week']
).mark_line(color='gray', opacity=0.25, strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('mean(week_count):Q', title='')
)

week_current_line = base.transform_filter(
    'datum.year==2020 & datum.week<'+week_cut_off
).mark_line(strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('sum(count):Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
)

week_statistik_austria_line = alt.Chart(statistik_austria).mark_line(clip=True,color='orange', strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('death_count:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False)),
    tooltip=[
        alt.Tooltip('week:O', title="Kalenderwoche"),
        alt.Tooltip('death_count:Q', title="Todesfälle")
    ]
)

week_statistik_austria_line_80 = alt.Chart(statistik_austria).transform_calculate(
    death_count_adjusted='datum.death_count*0.8'
).mark_line(clip=True,color='#ffbb78', strokeJoin='round').encode(
    alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
    alt.Y('death_count_adjusted:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False)),
    tooltip=[
        alt.Tooltip('week:O', title="Kalenderwoche"),
        alt.Tooltip('death_count_adjusted:Q', title="Todesfälle 80%")
    ]
)

chart_3_legendDomain = [
    'Statistik Austria 2020',
#     'Statistik Austria 2020 80%',
    'Todesanzeigen 2020',
    'Todesanzeigen 2017-2019 min/max',
    'Todesanzeigen 2017-2019 mean'
]
chart_3_legendColors = [
    'orange',
#     '#ffbb78',
    '#1f77b4',
    'lightgray',
    'gray'
]
chart_3_legendData = pd.DataFrame({
    'label': chart_3_legendDomain
})

legend3 = alt.Chart(chart_3_legendData).mark_square(size=150).encode(
    y=alt.Y(
        'label:N',
        axis=alt.Axis(domain=False, ticks=False, orient='right'),
        title=None
    ),
    color=alt.Color(
        'label',
        scale=alt.Scale(domain=chart_3_legendDomain,range=chart_3_legendColors),
        legend=None
    )
)

vega_week = ((
    week_historic_area +
    week_historic_mean +
    week_statistik_austria_line +
#     week_statistik_austria_line_80 +
    week_current_line +
    week_point
) | legend3).properties(
    title={
      "text": ["Todesanzeigen Tirol pro Woche im Jahresverlauf"],
    }
).configure_axis(
    grid=True,
    gridColor="#eee",
    domainColor="#ddd",
    tickColor="#ddd",
    labelColor="gray",
    labelBound=True,
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=14,
    fontWeight='bold',
    anchor='start',
    color='gray'
)

vega_week
Out[7]:
In [8]:
def district_chart(district='Kufstein'):
    d_df = df[df['district']==district]

    d_base = alt.Chart(d_df).properties(
        width=220,
        height=125
    )

    x_domain=(1,52)
    y_domain=(0,40)

    d_week_historic_area = d_base.transform_filter(
        'datum.year!=2020 && datum.year!=2016'
    ).mark_area(color='lightgray', opacity=0.5, strokeJoin='round').transform_joinaggregate(
        week_count='sum(count)',
        groupby=['year', 'week']
    ).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('max(week_count):Q', title=''),
        alt.Y2('min(week_count):Q', title='')
    )

    d_week_historic_mean = d_base.transform_filter(
        'datum.year!=2020 && datum.year!=2016'
    ).mark_line(color='gray', opacity=0.25, strokeJoin='round').transform_joinaggregate(
        week_count='sum(count)',
        groupby=['year', 'week']
    ).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('mean(week_count):Q', title='')
    )

    d_week_current_line = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off
    ).mark_line(strokeJoin='round').encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('sum(count):Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
    )

    d_week_point = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off
    ).mark_circle(color='gray', opacity=0, size=150).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('sum(count):Q', scale=alt.Scale(domain=y_domain,zero=True,nice=False), title=''),
        tooltip=[
            alt.Tooltip('week:O', title="Kalenderwoche"),
            alt.Tooltip('sum(count):Q', title="Todesanzeigen")
        ]
    )

    return (
        d_week_historic_area +
        d_week_historic_mean +
        d_week_current_line +
        d_week_point
    ).properties(
        title={
          "text": [district],
        }
    )

d_chart_3_legendDomain = [
    'Todesanzeigen 2020',
    'Todesanzeigen 2017-2019 min/max',
    'Todesanzeigen 2017-2019 mean'
]
d_chart_3_legendColors = ['#1f77b4', 'lightgray', 'gray']
d_chart_3_legendData = pd.DataFrame({
    'label': d_chart_3_legendDomain
})


d_legend3 = alt.Chart(d_chart_3_legendData).mark_square(size=150).encode(
    y=alt.Y(
        'label:N',
        axis=alt.Axis(domain=False, ticks=False, orient='right'),
        title=None
    ),
    color=alt.Color(
        'label',
        scale=alt.Scale(domain=d_chart_3_legendDomain,range=d_chart_3_legendColors),
        legend=None
    )
)

vega_week_district = (
    (
         district_chart('Imst') | district_chart('Innsbruck-Stadt') | district_chart('Innsbruck-Land')
    ) & (
        district_chart('Kitzbühel') | district_chart('Kufstein') | district_chart('Landeck')
    ) & (
        district_chart('Lienz') | district_chart('Reutte') | district_chart('Schwaz')
    ) | d_legend3
).configure_axis(
        grid=True,
        gridColor="#eee",
        domainColor="#ddd",
        tickColor="#ddd",
        labelColor="gray",
        labelBound=True,
    ).configure_view(
        strokeWidth=0
    ).configure_title(
        fontSize=14,
        fontWeight='normal',
        anchor='start',
        color='gray'
    )

vega_week_district
Out[8]:
In [9]:
def municipaly_chart(district='Kufstein'):
    d_both_district = d_both[d_both['district']==district]

    d_base = alt.Chart(d_both_district).properties(
        width=150,
        height=60
    )

    x_domain=(1,52)
    y_domain=(0,20)

    d_week_historic_area = d_base.transform_filter(
        'datum.year!=2020 && datum.year!=2016'
    ).transform_joinaggregate(
        week_count='sum(count)',
        groupby=['week','year']
    ).transform_calculate(
        week_count_min='min(datum.week_count)',
        week_count_max='max(datum.week_count)'
    ).mark_area(color='lightgray', opacity=0.5, strokeJoin='round').encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('max(week_count):Q', title=''),
        alt.Y2('min(week_count):Q', title='')
    )

    d_week_historic_mean = d_base.transform_filter(
        'datum.year!=2020 && datum.year!=2016'
    ).transform_joinaggregate(
        week_count='sum(count)',
        groupby=['week','year']
    ).mark_line(color='gray', opacity=0.25, strokeJoin='round').encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('mean(week_count):Q', title='')
    )

    d_week_current_line = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off
    ).transform_joinaggregate(
        week_count='sum(count)',
        groupby=['week','year']
    ).mark_line(strokeJoin='round').encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('week_count:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
    )

    d_week_point = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off
    ).mark_circle(color='gray', opacity=0, size=150).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('sum(count):Q', scale=alt.Scale(zero=True), title=''),
        tooltip=[
            alt.Tooltip('week:O', title="Kalenderwoche"),
            alt.Tooltip('sum(count):Q', title="Todesanzeigen"),
            alt.Tooltip('outlier_score:Q', title="Outlier-Score", format='.2%')
        ]
    )
    
    d_week_outlier = d_base.transform_filter(
        'datum.year==2020 & datum.week<'+week_cut_off+' & datum.outlier_score>=0.8'
    ).transform_joinaggregate(
        week_count='sum(count)',
        outlier_score_max='max(outlier_score)',
        groupby=['week','year']
    ).mark_point(color='red', opacity=.4, size=60, strokeWidth=3).encode(
        alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
        alt.Y('week_count:Q', scale=alt.Scale(zero=True), title='')
    )

    return (
        d_week_historic_area +
        d_week_historic_mean +
        d_week_current_line +
        d_week_outlier +
        d_week_point
    ).facet(
        facet=alt.Facet('municipaly:N', title=None, header=alt.Header(
            labelAnchor='start',
            labelOrient='bottom',
            labelPadding=0
        )),
        columns=5,
        padding=0,
        title={
          "text": ["Todesanzeigen im Bezirk "+district+" pro Gemeinde pro Woche im Jahresverlauf"], 
          "subtitle": [
    #               "Durchschnitt pro Monat in Hellblau",
              "Quelle: https://github.com/walterra/covid-19-tirol-ds"
          ],
          "color": "black",
          "subtitleColor": "gray"
        },
    ).configure_axis(
        grid=True,
        gridColor="#eee",
        domainColor="#ddd",
        tickColor="#ddd",
        labelColor="gray",
        labelBound=True,
    ).configure_view(
        strokeWidth=0
    ).configure_title(
        fontSize=14,
        fontWeight='bold',
        anchor='start',
        color='gray'
    )

d_chart_3_legendDomain = [
    'Todesanzeigen 2020',
    'Todesanzeigen 2017-2019 min/max',
    'Todesanzeigen 2017-2019 mean'
]
d_chart_3_legendColors = ['#1f77b4', 'lightgray', 'gray']
d_chart_3_legendData = pd.DataFrame({
    'label': d_chart_3_legendDomain
})


d_legend3 = alt.Chart(d_chart_3_legendData).mark_square(size=150).encode(
    y=alt.Y(
        'label:N',
        axis=alt.Axis(domain=False, ticks=False, orient='right'),
        title=None
    ),
    color=alt.Color(
        'label',
        scale=alt.Scale(domain=d_chart_3_legendDomain,range=d_chart_3_legendColors),
        legend=None
    )
)

vega_week_municipaly = municipaly_chart('Kufstein')

vega_week_municipaly
Out[9]:
In [10]:
# save chart_1 as formatted JSON
with open("../docs/data/vega_week.json", "w") as jsonFile:
    json.dump(json.loads(vega_week.to_json()), jsonFile, indent=4, sort_keys=True)
In [11]:
# save chart_1 as formatted JSON
with open("../docs/data/vega_week_district.json", "w") as jsonFile:
    json.dump(json.loads(vega_week_district.to_json()), jsonFile, indent=4, sort_keys=True)
In [12]:
districts = {
    "imst": "Imst",
    "innsbruck-land": "Innsbruck-Land",
    "innsbruck-stadt": "Innsbruck-Stadt",
    "kitzbuehel": "Kitzbühel",
    "kufstein": "Kufstein",
    "landeck": "Landeck",
    "reutte": "Reutte",
    "schwaz": "Schwaz",
    "lienz": "Lienz"
}

for d in districts:
    chart = municipaly_chart(districts[d])
    with open("../docs/data/vega_" + d + ".json", "w") as jsonFile:
        json.dump(json.loads(chart.to_json()), jsonFile, indent=4, sort_keys=True)